# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pycountry
import os
import numpy as np
import pandas as pd
import plotly.express as px
import warnings
# Suppress all warnings for the remainder of the run.
warnings.filterwarnings('ignore')

# Print the full path of every file found anywhere below the Kaggle input
# directory, one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        print(*(os.path.join(dirname, filename) for filename in filenames), sep="\n")
# Any results you write to the current directory are saved as output.
# Load the training data and inspect it.
# NOTE: the bare expressions below (head(), shape, ...) only display a result
# when they are the last statement of a notebook cell; run as a plain script
# their values are computed and discarded.
df_train=pd.read_csv("train.csv")
# First rows, then 6 randomly sampled rows
df_train.head()
df_train.sample(6)
# Number of rows and columns
df_train.shape
# Column names
df_train.columns
# Data type of each column
df_train.dtypes
# Load the test data and inspect it the same way
df_test=pd.read_csv("test.csv")
df_test.head()
df_test.shape
df_test.columns
# Distinct country names in the training data, how many there are,
# and how many rows each country contributes
df_train["Country_Region"].unique()
len(df_train["Country_Region"].unique())
df_train["Country_Region"].value_counts()
# Confirmed cases per country on 2020-05-13, largest first.
# Missing values are replaced with the string 'NA' before grouping so that
# Province_State can be used as a grouping key for every row.
df = (
    df_train[df_train['Date'] == "2020-05-13"]
    .fillna('NA')
    .groupby(['Country_Region', 'Province_State', 'Date'])['ConfirmedCases']
    .sum()
    .groupby(['Country_Region', 'Province_State'])
    .max()
    .sort_values()
    .groupby(['Country_Region'])
    .sum()
    .sort_values(ascending=False)
)

# Turn the Series into a two-column frame (Country_Region, ConfirmedCases).
top_conf_count = pd.DataFrame(df).reset_index()

# Bar chart of the 20 countries with the most confirmed cases.
fig_reg = px.bar(
    top_conf_count.head(20),
    x='Country_Region',
    y='ConfirmedCases',
)
fig_reg.show()
# Confirmed cases per country over the whole training period: sum per
# (country, province, date), take each province's maximum, then add the
# provinces up per country and sort descending.
df = (
    df_train.fillna('NA')
    .groupby(['Country_Region', 'Province_State', 'Date'])['ConfirmedCases']
    .sum()
    .groupby(['Country_Region', 'Province_State'])
    .max()
    .sort_values()
    .groupby(['Country_Region'])
    .sum()
    .sort_values(ascending=False)
)
top_conf_count = pd.DataFrame(df)          # all countries, indexed by Country_Region
top_conf_count1 = pd.DataFrame(df.head(10))  # ten largest only

# Bar chart of the ten countries, coloured by case count.
fig_reg = px.bar(
    top_conf_count1,
    x=top_conf_count1.index,
    y='ConfirmedCases',
    color='ConfirmedCases',
)
fig_reg.update_layout(
    title="Confirmed Cases by Country",
    xaxis_title=" Countries",
    yaxis_title="numbre of Confirmed Cases ",
)
fig_reg.show()
top_conf_count.head(10)
# Fatalities per country, computed the same way as the confirmed-case totals:
# sum per (country, province, date), maximum per province, sum per country.
df_d = (
    df_train.fillna('NA')
    .groupby(['Country_Region', 'Province_State', 'Date'])['Fatalities']
    .sum()
    .groupby(['Country_Region', 'Province_State'])
    .max()
    .sort_values()
    .groupby(['Country_Region'])
    .sum()
    .sort_values(ascending=False)
)
top_death_count = pd.DataFrame(df_d)
top_death_count
top_death_count1 = pd.DataFrame(df_d.head(10))

# Bar chart of the ten countries with the most fatalities.
fig_reg_fat = px.bar(
    top_death_count1,
    x=top_death_count1.index,
    y='Fatalities',
    color='Fatalities',
)
fig_reg_fat.update_layout(
    title="Fatalities by Country",
    xaxis_title=" Countries",
    yaxis_title="numbre of Fatalities ",
)
fig_reg_fat.show()
# Put the confirmed-case and fatality totals side by side (one row per
# country) and keep the 10 countries with the most confirmed cases.
# Note: the ordering is by ConfirmedCases (descending), not by fatalities.
top_count = pd.concat([top_conf_count , top_death_count],axis=1)
top_count = top_count.sort_values(['ConfirmedCases'],ascending=False)[:10]
top_count
import plotly.graph_objects as go

# Grouped bar chart: confirmed cases next to fatalities for each of the
# countries in top_count.
fig = go.Figure()
fig.add_trace(
    go.Bar(name='ConfirmedCases', x=top_count.index, y=top_count['ConfirmedCases'])
)
fig.add_trace(
    go.Bar(name='Fatalities', x=top_count.index, y=top_count['Fatalities'])
)
# Place the two bars of a country side by side instead of stacking them.
fig.update_layout(
    barmode='group',
    title="Confirmed Cases and Fatalities by Country",
    xaxis_title=" Countries",
    yaxis_title="number of Confirmed Cases and Fatalities ",
)
fig.show()
# Country totals on 2020-05-13, drawn as each country's share of all
# confirmed cases.
# The two value columns are selected with a list ([[...]]): selecting several
# groupby columns with a bare tuple ("a", "b") is rejected by current pandas.
temp_df = (
    df_train.loc[df_train["Date"] == "2020-05-13"]
    .groupby(['Country_Region'])[["ConfirmedCases", "Fatalities"]]
    .sum()
    .reset_index()
)
temp = temp_df.sort_values(by="ConfirmedCases", ascending=True)
fig = go.Figure(
    data=[go.Pie(labels=temp["Country_Region"], values=temp["ConfirmedCases"], hole=0.1)]
)
fig.show()
# Worldwide totals on 2020-05-13: one row for confirmed cases, one for
# fatalities, drawn as a two-slice pie.
Total = pd.DataFrame(
    {
        "Index": ["Confirmed Cases", "Fatalities"],
        "Values": [
            df_train.loc[df_train["Date"] == "2020-05-13"]["ConfirmedCases"].sum(),
            df_train.loc[df_train["Date"] == "2020-05-13"]["Fatalities"].sum(),
        ],
    }
)
fig = go.Figure(data=[go.Pie(labels=Total["Index"], values=Total["Values"], hole=0)])
fig.show()
# Tunisia on 2020-05-13: confirmed cases vs. fatalities as a pie chart.
d1 = df_train.loc[df_train["Date"] == "2020-05-13"]
Tunisia_df = d1.loc[d1["Country_Region"] == "Tunisia"]

# Take the last matching row's value for each measure.
Tunisia = pd.DataFrame(
    {
        "Index": ["Confirmed Cases", "Fatalities"],
        "Values": [
            Tunisia_df["ConfirmedCases"].iloc[-1],
            Tunisia_df["Fatalities"].iloc[-1],
        ],
    }
)
fig = go.Figure(data=[go.Pie(labels=Tunisia["Index"], values=Tunisia["Values"], hole=0.1)])
fig.show()
# All Tunisia rows of the training data (displayed only in a notebook cell).
df_train[df_train["Country_Region"]=="Tunisia"]

# Line chart of Tunisia's fatalities over time.
fig_tun_fatal = px.line(
    df_train[df_train["Country_Region"] == "Tunisia"],
    x="Date",
    y="Fatalities",
    title='Tunisia Covid-19 Fatalities',
)
# Apply the axis titles to the line chart itself. Previously they were applied
# to `fig`, the unrelated pie chart built earlier, so they never showed up here.
# barmode is dropped: it only affects bar traces.
fig_tun_fatal.update_layout(
    xaxis_title=" Date ",
    yaxis_title=" Fatalities ",
)
fig_tun_fatal.show()
# Line chart of Tunisia's confirmed cases over time.
fig_tun_confirmed = px.line(
    df_train[df_train["Country_Region"] == "Tunisia"],
    x="Date",
    y="ConfirmedCases",
    title='Tunisia Covid-19 confirmed cases',
)
# Apply the axis titles to this chart. Previously they were applied to `fig`,
# a different figure, so this chart kept its default axis labels.
fig_tun_confirmed.update_layout(
    xaxis_title=" Date ",
    yaxis_title=" Confirmed Cases",
)
fig_tun_confirmed.show()
# Tunisia: confirmed cases and fatalities over time on one chart.
tunisia_rows = df_train[df_train["Country_Region"] == "Tunisia"]
fig = go.Figure(
    data=[
        go.Scatter(
            x=tunisia_rows["Date"],
            y=tunisia_rows["ConfirmedCases"],
            name='ConfirmedCases',
            connectgaps=True,
        ),
        go.Scatter(
            x=tunisia_rows["Date"],
            y=tunisia_rows["Fatalities"],
            name='Fatalities',
        ),
    ]
)
fig.update_layout(
    title=' ConfirmedCases & Fatalities Covid-19 in Tunisia',
    xaxis_title=" Date ",
    yaxis_title=" Confirmed Cases & Fatalities",
)
fig.show()
# All US rows of the training data (displayed only in a notebook cell).
df_train[df_train["Country_Region"]=="US"]

# The 400 US rows with the highest confirmed cases / fatalities.
sort = df_train[df_train["Country_Region"] == "US"].sort_values(by=["ConfirmedCases"], ascending=False)[:400]
sort_fat = df_train[df_train["Country_Region"] == "US"].sort_values(by=["Fatalities"], ascending=False)[:400]

# Confirmed cases per state, one line per Province_State.
fig = px.line(sort, x="Date", y="ConfirmedCases", color='Province_State', title='US confirmed cases by state')
fig.update_layout(xaxis_title=" Date ", yaxis_title=" Confirmed Cases")
fig.show()

# Fatalities per state, one line per Province_State.
fig_fat = px.line(sort_fat, x="Date", y="Fatalities", color='Province_State', title='US Fatalities cases by state')
# Label the fatalities chart itself. Previously this call targeted `fig`
# (the confirmed-cases chart, already shown), so fig_fat was never labelled.
fig_fat.update_layout(xaxis_title=" Date ", yaxis_title="Fatalities")
fig_fat.show()
# US totals per day (all states summed): confirmed cases and fatalities.
# x and y both come from the same per-date aggregate, so every total is paired
# with its own date. Previously x was the raw Date column (one entry per
# state per day) while y had one entry per day, so the pairing only worked if
# the first rows happened to list every date once, in order.
us_daily = (
    df_train[df_train["Country_Region"] == "US"]
    .groupby("Date")[["ConfirmedCases", "Fatalities"]]
    .sum()
)
fig = go.Figure()
fig.add_trace(go.Scatter(
    y=us_daily["ConfirmedCases"],
    x=us_daily.index,
    name='ConfirmedCases',
    connectgaps=True,
))
fig.add_trace(go.Scatter(
    y=us_daily["Fatalities"],
    x=us_daily.index,
    name='Fatalities',
))
fig.update_layout(title=' ConfirmedCases & Fatalities in USA')
fig.show()
# Italy: confirmed cases and fatalities over time on one chart.
italy_rows = df_train[df_train["Country_Region"] == "Italy"]
fig = go.Figure(
    data=[
        go.Scatter(
            x=italy_rows["Date"],
            y=italy_rows["ConfirmedCases"],
            name='ConfirmedCases',
            connectgaps=True,
        ),
        go.Scatter(
            x=italy_rows["Date"],
            y=italy_rows["Fatalities"],
            name='Fatalities',
        ),
    ]
)
fig.update_layout(title=' ConfirmedCases & Fatalities in Italy')
fig.show()
# France totals per day (all provinces summed): confirmed cases and fatalities.
# x and y both come from the same per-date aggregate, so every total is paired
# with its own date. Previously x was the raw Date column (one entry per
# province per day) while y had one entry per day.
france_daily = (
    df_train[df_train["Country_Region"] == "France"]
    .groupby("Date")[["ConfirmedCases", "Fatalities"]]
    .sum()
)
fig = go.Figure()
fig.add_trace(go.Scatter(
    y=france_daily["ConfirmedCases"],
    x=france_daily.index,
    name='ConfirmedCases',
    connectgaps=True,
))
fig.add_trace(go.Scatter(
    y=france_daily["Fatalities"],
    x=france_daily.index,
    name='Fatalities',
))
fig.update_layout(title=' ConfirmedCases & Fatalities in France')
fig.show()
# Per-date, per-country totals of confirmed cases and fatalities.
country_df = (
    df_train.groupby(['Date', 'Country_Region'])[['ConfirmedCases', 'Fatalities']]
    .sum()
    .reset_index()
)
country_df.tail()

# The same totals keyed by (country, date).
data = df_train.groupby(["Country_Region", "Date"]).agg(
    {"ConfirmedCases": "sum", "Fatalities": "sum"}
)
data = data.reset_index()

# Distinct (date, country) pairs present in the test set.
test_data = df_test.groupby(["Date", "Country_Region"]).last().reset_index()
test_data = test_data[["Date", "Country_Region"]]

# Parse the date strings into datetimes.
data["Date"] = pd.to_datetime(data["Date"])
test_data["Date"] = pd.to_datetime(test_data["Date"])

# Country names occurring in each data set.
countries = data["Country_Region"].unique()
test_countries = test_data["Country_Region"].unique()
# Per-country, per-date sums of the training data, first indexed by
# (Country_Region, Date) and then flattened back into ordinary columns.
df_train1 = df_train.fillna('NA').groupby(['Country_Region', 'Date']).sum()
df_train1
df_train1 = df_train1.reset_index()
df_train1
import plotly.graph_objects as go
from plotly.offline import iplot

# Draw one chart per country with its confirmed cases and fatalities over time.
# Iterating over `countries` directly covers every country; the earlier
# range(1, len(countries)) / countries[i - 1] indexing skipped the last one.
for country in countries:
    _data = df_train1[df_train1["Country_Region"] == country]
    trace1 = go.Scatter(
        x=_data.Date,
        y=_data.ConfirmedCases,
        name="Confirmed Cases",
    )
    trace2 = go.Scatter(
        x=_data.Date,
        y=_data.Fatalities,
        name="Confirmed Fatalities",
    )
    data1 = [trace1, trace2]
    layout = go.Layout(title=country, xaxis={'title': 'Date'}, yaxis={'title': 'value'})
    fig = go.Figure(data=data1, layout=layout)
    iplot(fig)
# The 10 countries with the most confirmed cases on the latest date in the data.
# Column selections use lists ([[...]]); selecting several groupby columns with
# a bare tuple is rejected by current pandas.
last_date = df_train.Date.max()
df_countries = df_train[df_train['Date'] == last_date]
df_countries = df_countries.groupby('Country_Region', as_index=False)[['ConfirmedCases', 'Fatalities']].sum()
df_countries = df_countries.nlargest(10, 'ConfirmedCases')

# Daily totals restricted to those 10 countries. Merging on the key column
# alone acts as a filter and avoids creating (then dropping) *_x / *_y columns.
df_trend = df_train.groupby(['Date', 'Country_Region'], as_index=False)[['ConfirmedCases', 'Fatalities']].sum()
df_trend = df_trend.merge(df_countries[['Country_Region']], on='Country_Region')
df_trend = df_trend.rename(
    columns={'Country_Region': 'Country', 'ConfirmedCases': 'Cases', 'Fatalities': 'Deaths'}
)

# Log-scaled columns; log1p(x) == ln(x + 1), which keeps zero counts finite.
df_trend['ln(Cases)'] = np.log1p(df_trend['Cases'])
df_trend['ln(Deaths)'] = np.log1p(df_trend['Deaths'])

# Show each figure explicitly: a bare px.line(...) expression is only rendered
# when it is the last statement of a notebook cell.
px.line(df_trend, x='Date', y='Cases', color='Country', title='COVID19 Cases growth for top 10 worst affected countries').show()
px.line(df_trend, x='Date', y='Deaths', color='Country', title='COVID19 Deaths growth for top 10 worst affected countries').show()
px.line(df_trend, x='Date', y='ln(Cases)', color='Country', title='COVID19 Cases growth for top 10 worst affected countries(Logarithmic Scale)').show()
px.line(df_trend, x='Date', y='ln(Deaths)', color='Country', title='COVID19 Deaths growth for top 10 worst affected countries(Logarithmic Scale)').show()
# Per-date, per-country totals used by the map charts. Date is forced to str
# because it is later used as the animation frame label.
# The value columns are selected with a list ([[...]]); selecting several
# groupby columns with a bare tuple is rejected by current pandas.
df_map = df_train.copy()
df_map['Date'] = df_map['Date'].astype(str)
df_map = df_map.groupby(['Date', 'Country_Region'], as_index=False)[['ConfirmedCases', 'Fatalities']].sum()
def get_iso3_util(country_name):
    """Return the ISO 3166-1 alpha-3 code for a data-set country name.

    Lookup order:
      1. A few names are returned unchanged (see the first comment below).
      2. Exact match on pycountry's ``name`` field.
      3. Known data-set spellings are translated to a name pycountry can
         resolve, then the first ``search_fuzzy`` hit is used.

    Raises:
        LookupError: if the fuzzy search finds nothing (propagated from
            pycountry, as before).
    """
    # Names the original code returned as-is after the exact lookup failed.
    # Checking them first gives the same result without a pointless lookup.
    if country_name in ('Diamond Princess', 'Laos', 'MS Zaandam'):
        return country_name

    # Exact match. Depending on the pycountry version a miss either raises
    # KeyError or returns None; handle both instead of using a bare `except:`
    # that would also hide unrelated errors.
    try:
        country = pycountry.countries.get(name=country_name)
    except KeyError:
        country = None
    if country is not None:
        return country.alpha_3

    # Translate data-set spellings into names the fuzzy search can resolve.
    # NOTE(review): every name containing 'Congo' collapses to 'Congo', so
    # distinct Congo entries would get the same code - confirm this is intended.
    if 'Congo' in country_name:
        country_name = 'Congo'
    else:
        aliases = {
            'Korea, South': 'Korea, Republic of',
            'Taiwan*': 'Taiwan',
            'Burma': 'Myanmar',
            'West Bank and Gaza': 'Gaza',
        }
        country_name = aliases.get(country_name, country_name)

    # Take the best fuzzy match.
    return pycountry.countries.search_fuzzy(country_name)[0].alpha_3
# Cache of country name -> ISO alpha-3 code, filled lazily by get_iso3.
d = {}


def get_iso3(country):
    """Return the ISO alpha-3 code for ``country``, caching each lookup in ``d``.

    The code is returned on a cache miss as well as on a hit. Previously the
    miss branch stored the value but fell through without a return, so the
    first lookup of every country yielded None.
    """
    if country not in d:
        d[country] = get_iso3_util(country)
    return d[country]
# ISO alpha-3 code per row. Mapping over the single column avoids building a
# row object for every record, which a row-wise apply(axis=1) does.
df_map['iso_alpha'] = df_map['Country_Region'].map(get_iso3)

# Log-scaled columns for colouring; log1p(x) == ln(x + 1), so zeros stay finite.
df_map['ln(ConfirmedCases)'] = np.log1p(df_map.ConfirmedCases)
df_map['ln(Fatalities)'] = np.log1p(df_map.Fatalities)

# Animated world map of confirmed cases (log colour scale), one frame per date.
# .show() is called explicitly: a bare px.choropleth(...) expression is only
# rendered when it is the last statement of a notebook cell.
px.choropleth(
    df_map,
    locations="iso_alpha",
    color="ln(ConfirmedCases)",
    hover_name="Country_Region",
    hover_data=["ConfirmedCases"],
    animation_frame="Date",
    color_continuous_scale=px.colors.sequential.dense,
    title='Confirmed Cases growth(Logarithmic Scale)',
).show()

# Fatalities as a percentage of confirmed cases, rounded to 2 decimals.
# Rows with zero confirmed cases produce NaN (or inf) here.
df_map['Mortality Rate%'] = round((df_map.Fatalities / df_map.ConfirmedCases) * 100, 2)
# Animated bubble map of confirmed cases per country, one frame per date.
# Date is converted to str because it labels the animation frames.
country_df['Date'] = country_df['Date'].map(str)
fig = px.scatter_geo(
    country_df,
    locations="Country_Region",
    locationmode='country names',
    color="ConfirmedCases",
    size='ConfirmedCases',
    hover_name="Country_Region",
    hover_data=['ConfirmedCases', 'Fatalities'],
    # Fix the colour range to the largest total among the top countries.
    range_color=[0, top_count['ConfirmedCases'].max()],
    projection="natural earth",
    animation_frame="Date",
    title='COVID-19: Confirmed cases spread Over Time',
    color_continuous_scale="portland",
    size_max=80,
)
fig.show()
# Animated world map of fatalities (log colour scale), one frame per date.
# .show() is called explicitly: a bare px.choropleth(...) expression is only
# rendered when it is the last statement of a notebook cell.
px.choropleth(
    df_map,
    locations="iso_alpha",
    color="ln(Fatalities)",
    hover_name="Country_Region",
    hover_data=["Fatalities"],
    animation_frame="Date",
    color_continuous_scale=px.colors.sequential.OrRd,
    title='Deaths growth(Logarithmic Scale)',
).show()
# Animated world map of the mortality rate (fatalities as % of confirmed
# cases), one frame per date.
# .show() is called explicitly: a bare px.choropleth(...) expression is only
# rendered when it is the last statement of a notebook cell.
px.choropleth(
    df_map,
    locations="iso_alpha",
    color="Mortality Rate%",
    hover_name="Country_Region",
    hover_data=["ConfirmedCases", "Fatalities"],
    animation_frame="Date",
    color_continuous_scale=px.colors.sequential.Magma_r,
    title='Worldwide Daily Variation of Mortality Rat',
).show()
# Mortality rate (deaths as % of cases, 2 decimals) over time for the
# countries in df_trend. Dates with zero cases produce NaN (or inf).
df_trend['Mortality Rate%'] = round((df_trend.Deaths / df_trend.Cases) * 100, 2)
# .show() is called explicitly: a bare px.line(...) expression is only
# rendered when it is the last statement of a notebook cell.
px.line(
    df_trend,
    x='Date',
    y='Mortality Rate%',
    color='Country',
    title='Variation of Mortality Rate% \n(Top 10 worst affected countries)',
).show()